{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datetime import datetime\n",
    "import pandas as pd # Pandas\n",
    "import pathlib # Built-in path manipulation library\n",
    "from urllib.error import HTTPError"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_dir = pathlib.Path('../data/')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Could not download data for recovered, US\n"
     ]
    }
   ],
   "source": [
    "url_template = (\"https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/\"\n",
    "                \"csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_%s_%s.csv\")\n",
    "\n",
    "dfs = {}\n",
    "for region in ['global', 'US']:\n",
    "    dfs[region] = {}\n",
    "    for kind in ['confirmed', 'deaths', 'recovered']:\n",
    "        url = url_template % (kind, region) # Create the full data URL\n",
    "        try:\n",
    "            df = pd.read_csv(url) # Download the data into a dataframe\n",
    "        except HTTPError:\n",
    "            print(\"Could not download data for %s, %s\" % (kind, region))\n",
    "        else:\n",
    "            if region == 'global':\n",
    "                df1 = df[df['Province/State'].isnull()].set_index('Country/Region') # Whole countries only,\n",
    "                                                                                   # use country name as index\n",
    "                df2 = df[df['Country/Region']=='China'].sum(axis=0, skipna=False).to_frame().T\n",
    "                df2['Country/Region'] = 'China'\n",
    "                df2 = df2.set_index('Country/Region')\n",
    "                df = pd.concat([df1, df2])\n",
    "            elif region == 'US':\n",
    "                df = df.set_index('Province_State') # Use state name as index\n",
    "            df = df[[x for x in df if '20' in x]] # Use only data columns\n",
    "            dfs[region][kind] = df # Add to dictionary of dataframes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Generate list of countries or states currently in the repository (ours, not JHU's)\n",
    "countries_states = [x.name.split('.')[0].split('_')[-1]\n",
    "                    for x in data_dir.iterdir()\n",
    "                    if 'covidtimeseries_' in str(x)]\n",
    "# Just the countries\n",
    "countries = [x for x in countries_states if len(x)>2]\n",
    "# Just the states\n",
    "states = [x for x in countries_states if len(x)==2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "151\n"
     ]
    }
   ],
   "source": [
    "has_recoveries = dfs['global']['recovered'][dfs['global']['recovered'].max(axis=1)>0].index\n",
    "enough_cases = dfs['global']['confirmed'][dfs['global']['confirmed'].diff(axis=1).max(axis=1)>=5].index\n",
    "reports_deaths = dfs['global']['deaths'][dfs['global']['deaths'].max(axis=1)>0].index\n",
    "good_countries = list(has_recoveries.intersection(enough_cases).intersection(reports_deaths))\n",
    "print(len(good_countries))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "source = dfs['global']\n",
    "for country in good_countries:  # For each country\n",
    "    if country in source['confirmed'].index:  # If we have data in the downloaded JHU files for that country\n",
    "        df = pd.DataFrame(columns=['dates2', 'cum_cases', 'cum_deaths', 'cum_recover',\n",
    "                               'new_cases', 'new_deaths', 'new_recover', 'new_uninfected'])\n",
    "        df['dates2'] = source['confirmed'].columns\n",
    "        df['dates2'] = df['dates2'].apply(lambda x: datetime.strftime(datetime.strptime(x, '%m/%d/%y'), '%m/%d/%y'))\n",
    "        df['cum_cases'] = source['confirmed'].loc[country].values\n",
    "        df['cum_deaths'] = source['deaths'].loc[country].values\n",
    "        df['cum_recover'] = source['recovered'].loc[country].values\n",
    "        df[['new_cases', 'new_deaths', 'new_recover']] = \\\n",
    "            df[['cum_cases', 'cum_deaths', 'cum_recover']].diff()\n",
    "        df['new_uninfected'] = df['new_recover'] + df['new_deaths']\n",
    "        dfs[country] = df.set_index('dates2').fillna(0).astype(int)  # Fill NaN with 0 and convert to int\n",
    "        dfs[country].to_csv(data_dir /('covidtimeseries_%s.csv' % country))  # Overwrite old data\n",
    "    else:\n",
    "        print(\"No data for %s\" % country)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
